Big Lasso

## Warning: package 'rmdformats' was built under R version 4.0.3
# BiocManager::install('mixOmics')
library(mixOmics)

PCA

Principal Component Analysis, or PCA, is a popular technique applicable to different types such as dimensionality reduction, data compression, feature extraction, and visualization.

Basically, it is used to project a dataset from many correlated coordinates onto fewer uncorrelated coordinates called principal components while still retaining most of the variability present in the data.

###########################################
################### PCA ###################
###########################################

library(bigstatsr)
set.seed(1)
X <- big_attachExtdata()
n <- nrow(X)
X[1:2,1:3]
     [,1] [,2] [,3]
[1,]    2    2    0
[2,]    1    2    1
dim(X)
[1]  517 4542
# We only use only half of the data
ind <- sort(sample(n, n/2))
test <- big_SVD(X, fun.scaling = big_scale(),
                ind.row = ind)
str(test)
List of 5
 $ d     : num [1:10] 178.5 114.5 91 87.1 86.3 ...
 $ u     : num [1:258, 1:10] -0.1092 -0.0928 -0.0806 -0.0796 -0.1028 ...
 $ v     : num [1:4542, 1:10] 0.00607 0.00739 0.02921 -0.01283 0.01473 ...
 $ center: num [1:4542] 1.34 1.63 1.51 1.64 1.09 ...
 $ scale : num [1:4542] 0.665 0.551 0.631 0.55 0.708 ...
 - attr(*, "class")= chr "big_SVD"
# we project the sample on a lower dimensional space to see any
# structure
plot(test$u)

# A more realistic projection based on the scores
scores <- test$u %*% diag(test$d)
plot(scores[,2]~scores[,1])


# using the classical pca from R
pca <- prcomp(X[ind, ], center = TRUE, scale. = TRUE)
# same scaling
all.equal(test$center, pca$center)
[1] TRUE
all.equal(test$scale, pca$scale)
[1] TRUE
# projecting on new data
ind2 <- setdiff(rows_along(X), ind)
scores.test2 <- predict(test, X, ind.row = ind2)

plot(scores[,2]~scores[,1])
points(scores.test2[,2]~scores.test2[,1],col="red")

# We start by reading an image and we perform SVDs on this image.
if (!"jpeg" %in% installed.packages())
  install.packages("jpeg")
# Read image file into an array with three channels
# (Red-Green-Blue, RGB)
ivy <- jpeg::readJPEG("Ivy.jpg")
r <- ivy[, , 1] ; g <- ivy[, , 2] ; b <- ivy[, , 3]
# Performs full SVD of each channel
ivy.r.svd <- svd(r) ; ivy.g.svd <- svd(g) ;
ivy.b.svd <- svd(b)
rgb.svds <- list(ivy.r.svd, ivy.g.svd, ivy.b.svd)

# These two functions will be needed to display an image stored in an
# RGB array:
# Function to display an image stored in an RGB array
plot.image <- function(pic, main = "") {
  h <- dim(pic)[1] ; w <- dim(pic)[2]
  plot(x = c(0, h), y = c(0, w), type = "n", xlab = "",
       ylab = "", main = main)
  rasterImage(pic, 0, 0, h, w)
}

compress.image <- function(rgb.svds, nb.comp) {
  # nb.comp (number of components) should be less than min(dim(img[,,1])),
  # i.e., 170 here
  svd.lower.dim <- lapply(rgb.svds, function(i)
    list(d = i$d[1:nb.comp],
         u = i$u[, 1:nb.comp],
         v = i$v[, 1:nb.comp]))
  img <- sapply(svd.lower.dim, function(i) {
    img.compressed <- i$u %*% diag(i$d) %*% t(i$v)
  }, simplify = 'array')
  img[img < 0] <- 0
  img[img > 1] <- 1
  return(list(img = img, svd.reduced = svd.lower.dim))
}

par(mfrow = c(1, 2))
plot.image(ivy, "Original image")
p <- 20 ; plot.image(compress.image(rgb.svds, p)$img,
                     paste("SVD with", p, "components"))

object.size(rgb.svds) # Original image
19691960 bytes
object.size(compress.image(rgb.svds, p)$svd.reduced)
653720 bytes
# if (!requireNamespace("BiocManager", quietly = TRUE))
#   install.packages("BiocManager")
# BiocManager::install("mixOmics", version = "3.8")
library(mixOmics)

data(liver.toxicity)
X <- liver.toxicity$gene

MyResult.pca <- pca(X) # 1 Run the method
plotIndiv(MyResult.pca) # 2 Plot the samples

plotVar(MyResult.pca) # 3 Plot the variables

# samples
plotIndiv(MyResult.pca)

# variables
plotVar(MyResult.pca)

plotIndiv(MyResult.pca,
          group = liver.toxicity$treatment$Dose.Group,
          legend = TRUE)

# Customize plots: two factors displayed
plotIndiv(MyResult.pca, ind.names = FALSE,
          group = liver.toxicity$treatment$Dose.Group,
          pch = as.factor(liver.toxicity$treatment$Time.Group),
          legend = TRUE, title = 'Liver toxicity: genes, PCA comp 1 - 2',
          legend.title = 'Dose', legend.title.pch = 'Exposure')

# second PCA with 3 components:
MyResult.pca2 <- pca(X, ncomp = 3)
plotIndiv(MyResult.pca2, comp = c(1,3), legend = TRUE,
          group = liver.toxicity$treatment$Time.Group,
          title = 'Multidrug transporter, PCA comp 1 - 3')

# Here, the 3rd component on the y-axis clearly highlights a time of
# exposure effect.

# Amount of variance explained and choice of number of
# components
MyResult.pca3 <- pca(X, ncomp = 10)
plot(MyResult.pca3)


# We can also have a look at the variable coefficients in each
# component with the loading vectors.
# a minimal example
plotLoadings(MyResult.pca)

# a customized example to only show the top 100 genes
# and their gene name
plotLoadings(MyResult.pca, ndisplay = 100,
             name.var = liver.toxicity$gene.ID[, "geneBank"],
             size.name = rel(0.3))

plotIndiv(MyResult.pca2,
          group = liver.toxicity$treatment$Dose.Group,
          style="3d",legend = TRUE,
          title = 'Liver toxicity: genes, PCA comp 1 - 2 - 3')

PLS

PLS is a family of multivariate statistical techniques based on dimension reduction

#######################################
####################### PLS ###########
#######################################

# We first set up the data as X expression matrix and Y as the lipid abundance matrix. We also
# check that the dimensions are correct and match:
  
data(nutrimouse)
X <- nutrimouse$gene
Y <- nutrimouse$lipid
dim(X); dim(Y)
[1]  40 120
[1] 40 21
MyResult.pls <- pls(X,Y, ncomp=10)

plotIndiv(MyResult.pls)

plotVar(MyResult.pls)

plotIndiv(MyResult.pls)

plotVar(MyResult.pls)

plotIndiv(MyResult.pls, group = nutrimouse$genotype,
          rep.space = "XY-variate", legend = TRUE,
          legend.title = 'Genotype',
          ind.names = nutrimouse$diet,
          title = 'Nutrimouse: PLS')

# Here we change the size of the labels
plotVar(MyResult.pls, cex=c(3,2), legend = TRUE)

# A cut-off can be set to display only the variables that mostly contribute to the definition of each component.
plotVar(MyResult.pls, cutoff=0.5)

my.vip <- sort(vip(MyResult.pls)[,1],decreasing = TRUE)
barplot(my.vip[1:50],
        beside = FALSE,
        ylim = c(0, max(my.vip)), legend = rownames(my.vip)[1:50],
        main = "Variable Importance in the Projection", font.main = 4)

# The loading plots help visualise the coefficients assigned to each selected variable on each component:
plotLoadings(MyResult.pls, comp = 1, size.name = rel(0.5))

# We run a PLS model with a sufficient number of components first, then run perf on the
# object.
MyResult.pls <- pls(X,Y, ncomp = 6)
set.seed(30) # for reproducbility
perf.pls <- perf(MyResult.pls, validation = "Mfold", folds = 5,
                 progressBar = FALSE, nrepeat = 10)





plot(perf.pls$Q2.total)
abline(h = 0.0975)